{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Python Tools for Record Linking and Fuzzy Matching\n",
    "\n",
    "This notebook accompanies the [article](https://pbpython.com/record-linking.html) on Practical Business Python\n",
    "\n",
    "This notebook relies on [fuzzymatcher](https://github.com/RobinL/fuzzymatcher) and the [Python Record Linkage Toolkit](https://recordlinkage.readthedocs.io/en/latest/about.html)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import fuzzymatcher\n",
    "import recordlinkage"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example using fuzzymatcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "hospital_accounts = pd.read_csv(\n",
    "    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv'\n",
    ")\n",
    "hospital_reimbursement = pd.read_csv(\n",
    "    'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP Code</th>\n",
       "      <th>County Name</th>\n",
       "      <th>Phone Number</th>\n",
       "      <th>Hospital Type</th>\n",
       "      <th>Hospital Ownership</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10605</td>\n",
       "      <td>SAGE MEMORIAL HOSPITAL</td>\n",
       "      <td>STATE ROUTE 264 SOUTH 191</td>\n",
       "      <td>GANADO</td>\n",
       "      <td>AZ</td>\n",
       "      <td>86505</td>\n",
       "      <td>APACHE</td>\n",
       "      <td>(928) 755-4541</td>\n",
       "      <td>Critical Access Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>24250</td>\n",
       "      <td>WOODRIDGE BEHAVIORAL CENTER</td>\n",
       "      <td>600 NORTH 7TH STREET</td>\n",
       "      <td>WEST MEMPHIS</td>\n",
       "      <td>AR</td>\n",
       "      <td>72301</td>\n",
       "      <td>CRITTENDEN</td>\n",
       "      <td>(870) 394-4113</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10341</td>\n",
       "      <td>DOUGLAS GARDENS HOSPITAL</td>\n",
       "      <td>5200 NE 2ND AVE</td>\n",
       "      <td>MIAMI</td>\n",
       "      <td>FL</td>\n",
       "      <td>33137</td>\n",
       "      <td>MIAMI-DADE</td>\n",
       "      <td>(305) 751-8626</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>81101</td>\n",
       "      <td>SUNCOAST BEHAVIORAL HEALTH CENTER</td>\n",
       "      <td>4480 51ST ST W</td>\n",
       "      <td>BRADENTON</td>\n",
       "      <td>FL</td>\n",
       "      <td>34210</td>\n",
       "      <td>MANATEE</td>\n",
       "      <td>(941) 792-2222</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>39835</td>\n",
       "      <td>TREASURE VALLEY HOSPITAL</td>\n",
       "      <td>8800 WEST EMERALD STREET</td>\n",
       "      <td>BOISE</td>\n",
       "      <td>ID</td>\n",
       "      <td>83704</td>\n",
       "      <td>ADA</td>\n",
       "      <td>(208) 373-5000</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Account_Num                      Facility Name                    Address  \\\n",
       "0        10605             SAGE MEMORIAL HOSPITAL  STATE ROUTE 264 SOUTH 191   \n",
       "1        24250        WOODRIDGE BEHAVIORAL CENTER       600 NORTH 7TH STREET   \n",
       "2        10341           DOUGLAS GARDENS HOSPITAL            5200 NE 2ND AVE   \n",
       "3        81101  SUNCOAST BEHAVIORAL HEALTH CENTER             4480 51ST ST W   \n",
       "4        39835           TREASURE VALLEY HOSPITAL   8800 WEST EMERALD STREET   \n",
       "\n",
       "           City State  ZIP Code County Name    Phone Number  \\\n",
       "0        GANADO    AZ     86505      APACHE  (928) 755-4541   \n",
       "1  WEST MEMPHIS    AR     72301  CRITTENDEN  (870) 394-4113   \n",
       "2         MIAMI    FL     33137  MIAMI-DADE  (305) 751-8626   \n",
       "3     BRADENTON    FL     34210     MANATEE  (941) 792-2222   \n",
       "4         BOISE    ID     83704         ADA  (208) 373-5000   \n",
       "\n",
       "               Hospital Type              Hospital Ownership  \n",
       "0  Critical Access Hospitals  Voluntary non-profit - Private  \n",
       "1                Psychiatric                     Proprietary  \n",
       "2       Acute Care Hospitals  Voluntary non-profit - Private  \n",
       "3                Psychiatric                     Proprietary  \n",
       "4       Acute Care Hospitals                     Proprietary  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_accounts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>Provider Zip Code</th>\n",
       "      <th>Total Discharges</th>\n",
       "      <th>Average Covered Charges</th>\n",
       "      <th>Average Total Payments</th>\n",
       "      <th>Average Medicare Payments</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>839987</td>\n",
       "      <td>SOUTHEAST ALABAMA MEDICAL CENTER</td>\n",
       "      <td>1108 ROSS CLARK CIRCLE</td>\n",
       "      <td>DOTHAN</td>\n",
       "      <td>AL</td>\n",
       "      <td>36301</td>\n",
       "      <td>118</td>\n",
       "      <td>20855.61</td>\n",
       "      <td>5026.19</td>\n",
       "      <td>4115.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>519118</td>\n",
       "      <td>MARSHALL MEDICAL CENTER SOUTH</td>\n",
       "      <td>2505 U S HIGHWAY 431 NORTH</td>\n",
       "      <td>BOAZ</td>\n",
       "      <td>AL</td>\n",
       "      <td>35957</td>\n",
       "      <td>43</td>\n",
       "      <td>13289.09</td>\n",
       "      <td>5413.63</td>\n",
       "      <td>4490.93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>733073</td>\n",
       "      <td>ELIZA COFFEE MEMORIAL HOSPITAL</td>\n",
       "      <td>205 MARENGO STREET</td>\n",
       "      <td>FLORENCE</td>\n",
       "      <td>AL</td>\n",
       "      <td>35631</td>\n",
       "      <td>73</td>\n",
       "      <td>22261.60</td>\n",
       "      <td>4922.18</td>\n",
       "      <td>4021.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>201752</td>\n",
       "      <td>MIZELL MEMORIAL HOSPITAL</td>\n",
       "      <td>702 N MAIN ST</td>\n",
       "      <td>OPP</td>\n",
       "      <td>AL</td>\n",
       "      <td>36467</td>\n",
       "      <td>12</td>\n",
       "      <td>10901.33</td>\n",
       "      <td>5343.50</td>\n",
       "      <td>4284.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>678488</td>\n",
       "      <td>ST VINCENT'S EAST</td>\n",
       "      <td>50 MEDICAL PARK EAST DRIVE</td>\n",
       "      <td>BIRMINGHAM</td>\n",
       "      <td>AL</td>\n",
       "      <td>35235</td>\n",
       "      <td>74</td>\n",
       "      <td>28117.95</td>\n",
       "      <td>5947.12</td>\n",
       "      <td>4819.53</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Provider_Num                     Provider Name     Provider Street Address  \\\n",
       "0        839987  SOUTHEAST ALABAMA MEDICAL CENTER      1108 ROSS CLARK CIRCLE   \n",
       "1        519118     MARSHALL MEDICAL CENTER SOUTH  2505 U S HIGHWAY 431 NORTH   \n",
       "2        733073    ELIZA COFFEE MEMORIAL HOSPITAL          205 MARENGO STREET   \n",
       "3        201752          MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   \n",
       "4        678488                 ST VINCENT'S EAST  50 MEDICAL PARK EAST DRIVE   \n",
       "\n",
       "  Provider City Provider State  Provider Zip Code  Total Discharges  \\\n",
       "0        DOTHAN             AL              36301               118   \n",
       "1          BOAZ             AL              35957                43   \n",
       "2      FLORENCE             AL              35631                73   \n",
       "3           OPP             AL              36467                12   \n",
       "4    BIRMINGHAM             AL              35235                74   \n",
       "\n",
       "   Average Covered Charges  Average Total Payments  Average Medicare Payments  \n",
       "0                 20855.61                 5026.19                    4115.52  \n",
       "1                 13289.09                 5413.63                    4490.93  \n",
       "2                 22261.60                 4922.18                    4021.79  \n",
       "3                 10901.33                 5343.50                    4284.17  \n",
       "4                 28117.95                 5947.12                    4819.53  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_reimbursement.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns to match on from df_left\n",
    "left_on = [\"Facility Name\", \"Address\", \"City\", \"State\"]\n",
    "\n",
    "# Columns to match on from df_right\n",
    "right_on = [\n",
    "    \"Provider Name\", \"Provider Street Address\", \"Provider City\",\n",
    "    \"Provider State\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now perform the match\n",
    "# It will take several minutes to run on this data set\n",
    "matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,\n",
    "                                               hospital_reimbursement,\n",
    "                                               left_on,\n",
    "                                               right_on,\n",
    "                                               left_id_col='Account_Num',\n",
    "                                               right_id_col='Provider_Num')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>best_match_score</th>\n",
       "      <th>__id_left</th>\n",
       "      <th>__id_right</th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP Code</th>\n",
       "      <th>County Name</th>\n",
       "      <th>...</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>Provider Zip Code</th>\n",
       "      <th>Total Discharges</th>\n",
       "      <th>Average Covered Charges</th>\n",
       "      <th>Average Total Payments</th>\n",
       "      <th>Average Medicare Payments</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.746613</td>\n",
       "      <td>10605</td>\n",
       "      <td>643595</td>\n",
       "      <td>10605</td>\n",
       "      <td>SAGE MEMORIAL HOSPITAL</td>\n",
       "      <td>STATE ROUTE 264 SOUTH 191</td>\n",
       "      <td>GANADO</td>\n",
       "      <td>AZ</td>\n",
       "      <td>86505</td>\n",
       "      <td>APACHE</td>\n",
       "      <td>...</td>\n",
       "      <td>643595</td>\n",
       "      <td>TYLER MEMORIAL HOSPITAL</td>\n",
       "      <td>5950 STATE ROUTE 6 WEST</td>\n",
       "      <td>TUNKHANNOCK</td>\n",
       "      <td>PA</td>\n",
       "      <td>18657</td>\n",
       "      <td>18</td>\n",
       "      <td>20482.94</td>\n",
       "      <td>5783.22</td>\n",
       "      <td>4929.22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>234</th>\n",
       "      <td>-0.609873</td>\n",
       "      <td>24250</td>\n",
       "      <td>426767</td>\n",
       "      <td>24250</td>\n",
       "      <td>WOODRIDGE BEHAVIORAL CENTER</td>\n",
       "      <td>600 NORTH 7TH STREET</td>\n",
       "      <td>WEST MEMPHIS</td>\n",
       "      <td>AR</td>\n",
       "      <td>72301</td>\n",
       "      <td>CRITTENDEN</td>\n",
       "      <td>...</td>\n",
       "      <td>426767</td>\n",
       "      <td>CRISP REGIONAL HOSPITAL</td>\n",
       "      <td>902 7TH STREET NORTH</td>\n",
       "      <td>CORDELE</td>\n",
       "      <td>GA</td>\n",
       "      <td>31015</td>\n",
       "      <td>18</td>\n",
       "      <td>14655.94</td>\n",
       "      <td>5680.28</td>\n",
       "      <td>4899.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>-0.404473</td>\n",
       "      <td>10341</td>\n",
       "      <td>730311</td>\n",
       "      <td>10341</td>\n",
       "      <td>DOUGLAS GARDENS HOSPITAL</td>\n",
       "      <td>5200 NE 2ND AVE</td>\n",
       "      <td>MIAMI</td>\n",
       "      <td>FL</td>\n",
       "      <td>33137</td>\n",
       "      <td>MIAMI-DADE</td>\n",
       "      <td>...</td>\n",
       "      <td>730311</td>\n",
       "      <td>JACKSON MEMORIAL HOSPITAL</td>\n",
       "      <td>1611 NW 12TH AVE</td>\n",
       "      <td>MIAMI</td>\n",
       "      <td>FL</td>\n",
       "      <td>33136</td>\n",
       "      <td>113</td>\n",
       "      <td>30021.91</td>\n",
       "      <td>14448.75</td>\n",
       "      <td>11043.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>675</th>\n",
       "      <td>-0.333020</td>\n",
       "      <td>81101</td>\n",
       "      <td>302173</td>\n",
       "      <td>81101</td>\n",
       "      <td>SUNCOAST BEHAVIORAL HEALTH CENTER</td>\n",
       "      <td>4480 51ST ST W</td>\n",
       "      <td>BRADENTON</td>\n",
       "      <td>FL</td>\n",
       "      <td>34210</td>\n",
       "      <td>MANATEE</td>\n",
       "      <td>...</td>\n",
       "      <td>302173</td>\n",
       "      <td>BLAKE MEDICAL CENTER</td>\n",
       "      <td>2020 59TH ST W</td>\n",
       "      <td>BRADENTON</td>\n",
       "      <td>FL</td>\n",
       "      <td>34209</td>\n",
       "      <td>135</td>\n",
       "      <td>46792.59</td>\n",
       "      <td>5651.22</td>\n",
       "      <td>4534.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>893</th>\n",
       "      <td>-0.629261</td>\n",
       "      <td>39835</td>\n",
       "      <td>416060</td>\n",
       "      <td>39835</td>\n",
       "      <td>TREASURE VALLEY HOSPITAL</td>\n",
       "      <td>8800 WEST EMERALD STREET</td>\n",
       "      <td>BOISE</td>\n",
       "      <td>ID</td>\n",
       "      <td>83704</td>\n",
       "      <td>ADA</td>\n",
       "      <td>...</td>\n",
       "      <td>416060</td>\n",
       "      <td>ST LUKE'S REGIONAL MEDICAL CENTER</td>\n",
       "      <td>190 EAST BANNOCK STREET</td>\n",
       "      <td>BOISE</td>\n",
       "      <td>ID</td>\n",
       "      <td>83712</td>\n",
       "      <td>68</td>\n",
       "      <td>26564.97</td>\n",
       "      <td>6514.51</td>\n",
       "      <td>5631.87</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     best_match_score  __id_left  __id_right  Account_Num  \\\n",
       "0           -0.746613      10605      643595        10605   \n",
       "234         -0.609873      24250      426767        24250   \n",
       "446         -0.404473      10341      730311        10341   \n",
       "675         -0.333020      81101      302173        81101   \n",
       "893         -0.629261      39835      416060        39835   \n",
       "\n",
       "                         Facility Name                    Address  \\\n",
       "0               SAGE MEMORIAL HOSPITAL  STATE ROUTE 264 SOUTH 191   \n",
       "234        WOODRIDGE BEHAVIORAL CENTER       600 NORTH 7TH STREET   \n",
       "446           DOUGLAS GARDENS HOSPITAL            5200 NE 2ND AVE   \n",
       "675  SUNCOAST BEHAVIORAL HEALTH CENTER             4480 51ST ST W   \n",
       "893           TREASURE VALLEY HOSPITAL   8800 WEST EMERALD STREET   \n",
       "\n",
       "             City State  ZIP Code County Name  ... Provider_Num  \\\n",
       "0          GANADO    AZ     86505      APACHE  ...       643595   \n",
       "234  WEST MEMPHIS    AR     72301  CRITTENDEN  ...       426767   \n",
       "446         MIAMI    FL     33137  MIAMI-DADE  ...       730311   \n",
       "675     BRADENTON    FL     34210     MANATEE  ...       302173   \n",
       "893         BOISE    ID     83704         ADA  ...       416060   \n",
       "\n",
       "                         Provider Name  Provider Street Address  \\\n",
       "0              TYLER MEMORIAL HOSPITAL  5950 STATE ROUTE 6 WEST   \n",
       "234            CRISP REGIONAL HOSPITAL     902 7TH STREET NORTH   \n",
       "446          JACKSON MEMORIAL HOSPITAL         1611 NW 12TH AVE   \n",
       "675               BLAKE MEDICAL CENTER           2020 59TH ST W   \n",
       "893  ST LUKE'S REGIONAL MEDICAL CENTER  190 EAST BANNOCK STREET   \n",
       "\n",
       "     Provider City Provider State Provider Zip Code Total Discharges  \\\n",
       "0      TUNKHANNOCK             PA             18657               18   \n",
       "234        CORDELE             GA             31015               18   \n",
       "446          MIAMI             FL             33136              113   \n",
       "675      BRADENTON             FL             34209              135   \n",
       "893          BOISE             ID             83712               68   \n",
       "\n",
       "    Average Covered Charges  Average Total Payments  Average Medicare Payments  \n",
       "0                  20482.94                 5783.22                    4929.22  \n",
       "234                14655.94                 5680.28                    4899.39  \n",
       "446                30021.91                14448.75                   11043.58  \n",
       "675                46792.59                 5651.22                    4534.90  \n",
       "893                26564.97                 6514.51                    5631.87  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matched_results.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the columns to make viewing easier\n",
    "cols = [\n",
    "    \"best_match_score\", \"Facility Name\", \"Provider Name\", \"Address\", \"Provider Street Address\",\n",
    "    \"Provider City\", \"City\", \"Provider State\", \"State\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>best_match_score</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>77846</th>\n",
       "      <td>3.090931</td>\n",
       "      <td>RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION</td>\n",
       "      <td>RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION</td>\n",
       "      <td>530 NEW BRUNSWICK AVE</td>\n",
       "      <td>530 NEW BRUNSWICK AVE</td>\n",
       "      <td>PERTH AMBOY</td>\n",
       "      <td>PERTH AMBOY</td>\n",
       "      <td>NJ</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>532466</th>\n",
       "      <td>2.799072</td>\n",
       "      <td>ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL</td>\n",
       "      <td>ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL</td>\n",
       "      <td>ONE ROBERT WOOD JOHNSON PLACE</td>\n",
       "      <td>ONE ROBERT WOOD JOHNSON PLACE</td>\n",
       "      <td>NEW BRUNSWICK</td>\n",
       "      <td>NEW BRUNSWICK</td>\n",
       "      <td>NJ</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78435</th>\n",
       "      <td>2.785132</td>\n",
       "      <td>AVERA MCKENNAN HOSPITAL &amp; UNIVERSITY HEALTH CE...</td>\n",
       "      <td>AVERA MCKENNAN HOSPITAL &amp; UNIVERSITY HEALTH CE...</td>\n",
       "      <td>1325 S CLIFF AVE  POST OFFICE BOX 5045</td>\n",
       "      <td>1325 S CLIFF AVE  POST OFFICE BOX 5045</td>\n",
       "      <td>SIOUX FALLS</td>\n",
       "      <td>SIOUX FALLS</td>\n",
       "      <td>SD</td>\n",
       "      <td>SD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242697</th>\n",
       "      <td>2.778860</td>\n",
       "      <td>JOHN T MATHER MEMORIAL HOSPITAL  OF PORT JEFFE...</td>\n",
       "      <td>JOHN T MATHER MEMORIAL HOSPITAL  OF PORT JEFFE...</td>\n",
       "      <td>75 NORTH COUNTRY ROAD</td>\n",
       "      <td>75 NORTH COUNTRY ROAD</td>\n",
       "      <td>PORT JEFFERSON</td>\n",
       "      <td>PORT JEFFERSON</td>\n",
       "      <td>NY</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447347</th>\n",
       "      <td>2.721425</td>\n",
       "      <td>MAYO CLINIC HEALTH SYSTEM - RED WING</td>\n",
       "      <td>MAYO CLINIC HEALTH SYSTEM IN RED WING</td>\n",
       "      <td>701 HEWITT BOULEVARD, PO BOX 95</td>\n",
       "      <td>701 HEWITT BOULEVARD, PO BOX 95</td>\n",
       "      <td>RED WING</td>\n",
       "      <td>RED WING</td>\n",
       "      <td>MN</td>\n",
       "      <td>MN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        best_match_score                                      Facility Name  \\\n",
       "77846           3.090931    RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION   \n",
       "532466          2.799072            ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL   \n",
       "78435           2.785132  AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE...   \n",
       "242697          2.778860  JOHN T MATHER MEMORIAL HOSPITAL  OF PORT JEFFE...   \n",
       "447347          2.721425               MAYO CLINIC HEALTH SYSTEM - RED WING   \n",
       "\n",
       "                                            Provider Name  \\\n",
       "77846     RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION   \n",
       "532466            ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL   \n",
       "78435   AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE...   \n",
       "242697  JOHN T MATHER MEMORIAL HOSPITAL  OF PORT JEFFE...   \n",
       "447347              MAYO CLINIC HEALTH SYSTEM IN RED WING   \n",
       "\n",
       "                                       Address  \\\n",
       "77846                    530 NEW BRUNSWICK AVE   \n",
       "532466           ONE ROBERT WOOD JOHNSON PLACE   \n",
       "78435   1325 S CLIFF AVE  POST OFFICE BOX 5045   \n",
       "242697                   75 NORTH COUNTRY ROAD   \n",
       "447347         701 HEWITT BOULEVARD, PO BOX 95   \n",
       "\n",
       "                       Provider Street Address   Provider City  \\\n",
       "77846                    530 NEW BRUNSWICK AVE     PERTH AMBOY   \n",
       "532466           ONE ROBERT WOOD JOHNSON PLACE   NEW BRUNSWICK   \n",
       "78435   1325 S CLIFF AVE  POST OFFICE BOX 5045     SIOUX FALLS   \n",
       "242697                   75 NORTH COUNTRY ROAD  PORT JEFFERSON   \n",
       "447347         701 HEWITT BOULEVARD, PO BOX 95        RED WING   \n",
       "\n",
       "                  City Provider State State  \n",
       "77846      PERTH AMBOY             NJ    NJ  \n",
       "532466   NEW BRUNSWICK             NJ    NJ  \n",
       "78435      SIOUX FALLS             SD    SD  \n",
       "242697  PORT JEFFERSON             NY    NY  \n",
       "447347        RED WING             MN    MN  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's see the best matches\n",
    "matched_results[cols].sort_values(by=['best_match_score'], ascending=False).head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>best_match_score</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>426256</th>\n",
       "      <td>-2.268231</td>\n",
       "      <td>CENTRO MEDICO WILMA N VAZQUEZ</td>\n",
       "      <td>BAPTIST MEDICAL CENTER EAST</td>\n",
       "      <td>CARR. 2 KM 39.5    ROAD NUMBER 2 BO ALGARROBO</td>\n",
       "      <td>400 TAYLOR ROAD</td>\n",
       "      <td>MONTGOMERY</td>\n",
       "      <td>VEGA BAJA</td>\n",
       "      <td>AL</td>\n",
       "      <td>PR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83051</th>\n",
       "      <td>-2.124071</td>\n",
       "      <td>DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO...</td>\n",
       "      <td>OVERLAKE HOSPITAL MEDICAL CENTER</td>\n",
       "      <td>EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P...</td>\n",
       "      <td>1035-116TH AVE NE</td>\n",
       "      <td>BELLEVUE</td>\n",
       "      <td>CAROLINA</td>\n",
       "      <td>WA</td>\n",
       "      <td>PR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42613</th>\n",
       "      <td>-2.106746</td>\n",
       "      <td>HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ</td>\n",
       "      <td>SCRIPPS MERCY HOSPITAL</td>\n",
       "      <td>BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO...</td>\n",
       "      <td>4077 5TH AVE</td>\n",
       "      <td>SAN DIEGO</td>\n",
       "      <td>SAN JUAN</td>\n",
       "      <td>CA</td>\n",
       "      <td>PR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>450232</th>\n",
       "      <td>-2.050888</td>\n",
       "      <td>CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA</td>\n",
       "      <td>MILFORD REGIONAL MEDICAL CENTER</td>\n",
       "      <td>CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT...</td>\n",
       "      <td>14 PROSPECT STREET</td>\n",
       "      <td>MILFORD</td>\n",
       "      <td>AIBONITO</td>\n",
       "      <td>MA</td>\n",
       "      <td>PR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>476086</th>\n",
       "      <td>-1.996508</td>\n",
       "      <td>ADMIN DE SERVICIOS MEDICOS  PUERTO RIC</td>\n",
       "      <td>MAINE MEDICAL CENTER</td>\n",
       "      <td>BO MONACILLO CARR NUM 22</td>\n",
       "      <td>22 BRAMHALL ST</td>\n",
       "      <td>PORTLAND</td>\n",
       "      <td>SAN JUAN</td>\n",
       "      <td>ME</td>\n",
       "      <td>PR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        best_match_score                                      Facility Name  \\\n",
       "426256         -2.268231                      CENTRO MEDICO WILMA N VAZQUEZ   \n",
       "83051          -2.124071  DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO...   \n",
       "42613          -2.106746     HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ   \n",
       "450232         -2.050888           CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA   \n",
       "476086         -1.996508             ADMIN DE SERVICIOS MEDICOS  PUERTO RIC   \n",
       "\n",
       "                           Provider Name  \\\n",
       "426256       BAPTIST MEDICAL CENTER EAST   \n",
       "83051   OVERLAKE HOSPITAL MEDICAL CENTER   \n",
       "42613             SCRIPPS MERCY HOSPITAL   \n",
       "450232   MILFORD REGIONAL MEDICAL CENTER   \n",
       "476086              MAINE MEDICAL CENTER   \n",
       "\n",
       "                                                  Address  \\\n",
       "426256      CARR. 2 KM 39.5    ROAD NUMBER 2 BO ALGARROBO   \n",
       "83051   EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P...   \n",
       "42613   BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO...   \n",
       "450232  CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT...   \n",
       "476086                           BO MONACILLO CARR NUM 22   \n",
       "\n",
       "       Provider Street Address Provider City       City Provider State State  \n",
       "426256         400 TAYLOR ROAD    MONTGOMERY  VEGA BAJA             AL    PR  \n",
       "83051        1035-116TH AVE NE      BELLEVUE   CAROLINA             WA    PR  \n",
       "42613             4077 5TH AVE     SAN DIEGO   SAN JUAN             CA    PR  \n",
       "450232      14 PROSPECT STREET       MILFORD   AIBONITO             MA    PR  \n",
       "476086          22 BRAMHALL ST      PORTLAND   SAN JUAN             ME    PR  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now the worst matches\n",
    "matched_results[cols].sort_values(by=['best_match_score'],\n",
    "                                  ascending=True).head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>best_match_score</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>413779</th>\n",
       "      <td>0.999843</td>\n",
       "      <td>PRISMA HEALTH BAPTIST</td>\n",
       "      <td>PALMETTO HEALTH BAPTIST</td>\n",
       "      <td>1330 TAYLOR AT MARION ST</td>\n",
       "      <td>TAYLOR AT MARION ST</td>\n",
       "      <td>COLUMBIA</td>\n",
       "      <td>COLUMBIA</td>\n",
       "      <td>SC</td>\n",
       "      <td>SC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148813</th>\n",
       "      <td>0.999724</td>\n",
       "      <td>CHRIST HOSPITAL</td>\n",
       "      <td>CHRIST HOSPITAL</td>\n",
       "      <td>2139 AUBURN AVENUE</td>\n",
       "      <td>2139 AUBURN AVENUE</td>\n",
       "      <td>CINCINNATI</td>\n",
       "      <td>CINCINNATI</td>\n",
       "      <td>OH</td>\n",
       "      <td>OH</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>387796</th>\n",
       "      <td>0.997759</td>\n",
       "      <td>RUSSELL COUNTY HOSPITAL</td>\n",
       "      <td>RUSSELL COUNTY MEDICAL CENTER</td>\n",
       "      <td>58 CARROLL STREET</td>\n",
       "      <td>58 CARROLL STREET</td>\n",
       "      <td>LEBANON</td>\n",
       "      <td>LEBANON</td>\n",
       "      <td>VA</td>\n",
       "      <td>VA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>377611</th>\n",
       "      <td>0.993306</td>\n",
       "      <td>SKY RIDGE MEDICAL CENTER</td>\n",
       "      <td>SKY RIDGE MEDICAL CENTER</td>\n",
       "      <td>10101 RIDGEGATE PARKWAY</td>\n",
       "      <td>10101 RIDGE GATE PARKWAY</td>\n",
       "      <td>LONE TREE</td>\n",
       "      <td>LONE TREE</td>\n",
       "      <td>CO</td>\n",
       "      <td>CO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>303356</th>\n",
       "      <td>0.991168</td>\n",
       "      <td>HIGHLAND HOSPITAL</td>\n",
       "      <td>HIGHLAND HOSPITAL</td>\n",
       "      <td>1000 SOUTH AVENUE</td>\n",
       "      <td>1000 SOUTH AVENUE</td>\n",
       "      <td>ROCHESTER</td>\n",
       "      <td>ROCHESTER</td>\n",
       "      <td>NY</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>538105</th>\n",
       "      <td>0.990481</td>\n",
       "      <td>PAOLI HOSPITAL</td>\n",
       "      <td>MAIN LINE HOSPITAL PAOLI</td>\n",
       "      <td>255 WEST LANCASTER AVENUE</td>\n",
       "      <td>255 WEST LANCASTER AVENUE</td>\n",
       "      <td>PAOLI</td>\n",
       "      <td>PAOLI</td>\n",
       "      <td>PA</td>\n",
       "      <td>PA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>251502</th>\n",
       "      <td>0.986695</td>\n",
       "      <td>NYU WINTHROP HOSPITAL</td>\n",
       "      <td>WINTHROP-UNIVERSITY HOSPITAL</td>\n",
       "      <td>259 FIRST STREET</td>\n",
       "      <td>259 FIRST STREET</td>\n",
       "      <td>MINEOLA</td>\n",
       "      <td>MINEOLA</td>\n",
       "      <td>NY</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>310674</th>\n",
       "      <td>0.986440</td>\n",
       "      <td>ADVENTHEALTH GORDON</td>\n",
       "      <td>GORDON HOSPITAL</td>\n",
       "      <td>1035 RED BUD ROAD</td>\n",
       "      <td>1035 RED BUD ROAD</td>\n",
       "      <td>CALHOUN</td>\n",
       "      <td>CALHOUN</td>\n",
       "      <td>GA</td>\n",
       "      <td>GA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203931</th>\n",
       "      <td>0.985763</td>\n",
       "      <td>MEMORIAL MEDICAL CENTER</td>\n",
       "      <td>MEMORIAL MEDICAL CENTER</td>\n",
       "      <td>701 N FIRST ST</td>\n",
       "      <td>701 N FIRST ST</td>\n",
       "      <td>SPRINGFIELD</td>\n",
       "      <td>SPRINGFIELD</td>\n",
       "      <td>IL</td>\n",
       "      <td>IL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102187</th>\n",
       "      <td>0.984978</td>\n",
       "      <td>HENDRICK MEDICAL CENTER</td>\n",
       "      <td>HENDRICK MEDICAL CENTER</td>\n",
       "      <td>1900 PINE</td>\n",
       "      <td>1900 PINE</td>\n",
       "      <td>ABILENE</td>\n",
       "      <td>ABILENE</td>\n",
       "      <td>TX</td>\n",
       "      <td>TX</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        best_match_score             Facility Name  \\\n",
       "413779          0.999843     PRISMA HEALTH BAPTIST   \n",
       "148813          0.999724           CHRIST HOSPITAL   \n",
       "387796          0.997759   RUSSELL COUNTY HOSPITAL   \n",
       "377611          0.993306  SKY RIDGE MEDICAL CENTER   \n",
       "303356          0.991168         HIGHLAND HOSPITAL   \n",
       "538105          0.990481            PAOLI HOSPITAL   \n",
       "251502          0.986695     NYU WINTHROP HOSPITAL   \n",
       "310674          0.986440       ADVENTHEALTH GORDON   \n",
       "203931          0.985763   MEMORIAL MEDICAL CENTER   \n",
       "102187          0.984978   HENDRICK MEDICAL CENTER   \n",
       "\n",
       "                        Provider Name                    Address  \\\n",
       "413779        PALMETTO HEALTH BAPTIST   1330 TAYLOR AT MARION ST   \n",
       "148813                CHRIST HOSPITAL         2139 AUBURN AVENUE   \n",
       "387796  RUSSELL COUNTY MEDICAL CENTER          58 CARROLL STREET   \n",
       "377611       SKY RIDGE MEDICAL CENTER    10101 RIDGEGATE PARKWAY   \n",
       "303356              HIGHLAND HOSPITAL          1000 SOUTH AVENUE   \n",
       "538105       MAIN LINE HOSPITAL PAOLI  255 WEST LANCASTER AVENUE   \n",
       "251502   WINTHROP-UNIVERSITY HOSPITAL           259 FIRST STREET   \n",
       "310674                GORDON HOSPITAL          1035 RED BUD ROAD   \n",
       "203931        MEMORIAL MEDICAL CENTER             701 N FIRST ST   \n",
       "102187        HENDRICK MEDICAL CENTER                  1900 PINE   \n",
       "\n",
       "          Provider Street Address Provider City         City Provider State  \\\n",
       "413779        TAYLOR AT MARION ST      COLUMBIA     COLUMBIA             SC   \n",
       "148813         2139 AUBURN AVENUE    CINCINNATI   CINCINNATI             OH   \n",
       "387796          58 CARROLL STREET       LEBANON      LEBANON             VA   \n",
       "377611   10101 RIDGE GATE PARKWAY     LONE TREE    LONE TREE             CO   \n",
       "303356          1000 SOUTH AVENUE     ROCHESTER    ROCHESTER             NY   \n",
       "538105  255 WEST LANCASTER AVENUE         PAOLI        PAOLI             PA   \n",
       "251502           259 FIRST STREET       MINEOLA      MINEOLA             NY   \n",
       "310674          1035 RED BUD ROAD       CALHOUN      CALHOUN             GA   \n",
       "203931             701 N FIRST ST   SPRINGFIELD  SPRINGFIELD             IL   \n",
       "102187                  1900 PINE       ABILENE      ABILENE             TX   \n",
       "\n",
       "       State  \n",
       "413779    SC  \n",
       "148813    OH  \n",
       "387796    VA  \n",
       "377611    CO  \n",
       "303356    NY  \n",
       "538105    PA  \n",
       "251502    NY  \n",
       "310674    GA  \n",
       "203931    IL  \n",
       "102187    TX  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look at the matches around 1\n",
    "matched_results[cols].query(\"best_match_score <= 1\").sort_values(\n",
    "    by=['best_match_score'], ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>best_match_score</th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>518066</th>\n",
       "      <td>0.792471</td>\n",
       "      <td>METHODIST HOSPITAL SOUTH</td>\n",
       "      <td>SOUTH TEXAS REGIONAL MEDICAL CENTER</td>\n",
       "      <td>1905 HWY 97 EAST</td>\n",
       "      <td>1905 HWY 97 EAST</td>\n",
       "      <td>JOURDANTON</td>\n",
       "      <td>JOURDANTON</td>\n",
       "      <td>TX</td>\n",
       "      <td>TX</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>416905</th>\n",
       "      <td>0.791668</td>\n",
       "      <td>ADVENTIST HEALTH UKIAH VALLEY</td>\n",
       "      <td>UKIAH VALLEY MEDICAL CENTER</td>\n",
       "      <td>275 HOSPITAL DRIVE</td>\n",
       "      <td>275 HOSPITAL DRIVE</td>\n",
       "      <td>UKIAH</td>\n",
       "      <td>UKIAH</td>\n",
       "      <td>CA</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>303110</th>\n",
       "      <td>0.787163</td>\n",
       "      <td>MADISON HEALTH</td>\n",
       "      <td>MADISON COUNTY HOSPITAL INC</td>\n",
       "      <td>210 NORTH MAIN STREET</td>\n",
       "      <td>210 NORTH MAIN STREET</td>\n",
       "      <td>LONDON</td>\n",
       "      <td>LONDON</td>\n",
       "      <td>OH</td>\n",
       "      <td>OH</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>388140</th>\n",
       "      <td>0.776632</td>\n",
       "      <td>PENN HIGHLANDS CLEARFIELD</td>\n",
       "      <td>CLEARFIELD HOSPITAL</td>\n",
       "      <td>809 TURNPIKE AVE</td>\n",
       "      <td>809 TURNPIKE AVE</td>\n",
       "      <td>CLEARFIELD</td>\n",
       "      <td>CLEARFIELD</td>\n",
       "      <td>PA</td>\n",
       "      <td>PA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>492811</th>\n",
       "      <td>0.775573</td>\n",
       "      <td>MEMORIAL HOSPITAL AT GULFPORT</td>\n",
       "      <td>MEMORIAL HOSPITAL AT GULFPORT</td>\n",
       "      <td>4500 13TH STREET</td>\n",
       "      <td>4500 13TH ST-P O BOX 1810</td>\n",
       "      <td>GULFPORT</td>\n",
       "      <td>GULFPORT</td>\n",
       "      <td>MS</td>\n",
       "      <td>MS</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        best_match_score                  Facility Name  \\\n",
       "518066          0.792471       METHODIST HOSPITAL SOUTH   \n",
       "416905          0.791668  ADVENTIST HEALTH UKIAH VALLEY   \n",
       "303110          0.787163                 MADISON HEALTH   \n",
       "388140          0.776632      PENN HIGHLANDS CLEARFIELD   \n",
       "492811          0.775573  MEMORIAL HOSPITAL AT GULFPORT   \n",
       "\n",
       "                              Provider Name                Address  \\\n",
       "518066  SOUTH TEXAS REGIONAL MEDICAL CENTER       1905 HWY 97 EAST   \n",
       "416905          UKIAH VALLEY MEDICAL CENTER     275 HOSPITAL DRIVE   \n",
       "303110          MADISON COUNTY HOSPITAL INC  210 NORTH MAIN STREET   \n",
       "388140                  CLEARFIELD HOSPITAL       809 TURNPIKE AVE   \n",
       "492811        MEMORIAL HOSPITAL AT GULFPORT       4500 13TH STREET   \n",
       "\n",
       "          Provider Street Address Provider City        City Provider State  \\\n",
       "518066           1905 HWY 97 EAST    JOURDANTON  JOURDANTON             TX   \n",
       "416905         275 HOSPITAL DRIVE         UKIAH       UKIAH             CA   \n",
       "303110      210 NORTH MAIN STREET        LONDON      LONDON             OH   \n",
       "388140           809 TURNPIKE AVE    CLEARFIELD  CLEARFIELD             PA   \n",
       "492811  4500 13TH ST-P O BOX 1810      GULFPORT    GULFPORT             MS   \n",
       "\n",
       "       State  \n",
       "518066    TX  \n",
       "416905    CA  \n",
       "303110    OH  \n",
       "388140    PA  \n",
       "492811    MS  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matched_results[cols].query(\"best_match_score <= .80\").sort_values(\n",
    "    by=['best_match_score'], ascending=False).head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example using Python Record Linkage Toolkit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-read in the data using the index_col\n",
    "hospital_accounts = pd.read_csv(\n",
    "    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv',\n",
    "    index_col='Account_Num'\n",
    ")\n",
    "hospital_reimbursement = pd.read_csv(\n",
    "    'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv',\n",
    "    index_col='Provider_Num'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP Code</th>\n",
       "      <th>County Name</th>\n",
       "      <th>Phone Number</th>\n",
       "      <th>Hospital Type</th>\n",
       "      <th>Hospital Ownership</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Account_Num</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10605</th>\n",
       "      <td>SAGE MEMORIAL HOSPITAL</td>\n",
       "      <td>STATE ROUTE 264 SOUTH 191</td>\n",
       "      <td>GANADO</td>\n",
       "      <td>AZ</td>\n",
       "      <td>86505</td>\n",
       "      <td>APACHE</td>\n",
       "      <td>(928) 755-4541</td>\n",
       "      <td>Critical Access Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24250</th>\n",
       "      <td>WOODRIDGE BEHAVIORAL CENTER</td>\n",
       "      <td>600 NORTH 7TH STREET</td>\n",
       "      <td>WEST MEMPHIS</td>\n",
       "      <td>AR</td>\n",
       "      <td>72301</td>\n",
       "      <td>CRITTENDEN</td>\n",
       "      <td>(870) 394-4113</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10341</th>\n",
       "      <td>DOUGLAS GARDENS HOSPITAL</td>\n",
       "      <td>5200 NE 2ND AVE</td>\n",
       "      <td>MIAMI</td>\n",
       "      <td>FL</td>\n",
       "      <td>33137</td>\n",
       "      <td>MIAMI-DADE</td>\n",
       "      <td>(305) 751-8626</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81101</th>\n",
       "      <td>SUNCOAST BEHAVIORAL HEALTH CENTER</td>\n",
       "      <td>4480 51ST ST W</td>\n",
       "      <td>BRADENTON</td>\n",
       "      <td>FL</td>\n",
       "      <td>34210</td>\n",
       "      <td>MANATEE</td>\n",
       "      <td>(941) 792-2222</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39835</th>\n",
       "      <td>TREASURE VALLEY HOSPITAL</td>\n",
       "      <td>8800 WEST EMERALD STREET</td>\n",
       "      <td>BOISE</td>\n",
       "      <td>ID</td>\n",
       "      <td>83704</td>\n",
       "      <td>ADA</td>\n",
       "      <td>(208) 373-5000</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 Facility Name                    Address  \\\n",
       "Account_Num                                                                 \n",
       "10605                   SAGE MEMORIAL HOSPITAL  STATE ROUTE 264 SOUTH 191   \n",
       "24250              WOODRIDGE BEHAVIORAL CENTER       600 NORTH 7TH STREET   \n",
       "10341                 DOUGLAS GARDENS HOSPITAL            5200 NE 2ND AVE   \n",
       "81101        SUNCOAST BEHAVIORAL HEALTH CENTER             4480 51ST ST W   \n",
       "39835                 TREASURE VALLEY HOSPITAL   8800 WEST EMERALD STREET   \n",
       "\n",
       "                     City State  ZIP Code County Name    Phone Number  \\\n",
       "Account_Num                                                             \n",
       "10605              GANADO    AZ     86505      APACHE  (928) 755-4541   \n",
       "24250        WEST MEMPHIS    AR     72301  CRITTENDEN  (870) 394-4113   \n",
       "10341               MIAMI    FL     33137  MIAMI-DADE  (305) 751-8626   \n",
       "81101           BRADENTON    FL     34210     MANATEE  (941) 792-2222   \n",
       "39835               BOISE    ID     83704         ADA  (208) 373-5000   \n",
       "\n",
       "                         Hospital Type              Hospital Ownership  \n",
       "Account_Num                                                             \n",
       "10605        Critical Access Hospitals  Voluntary non-profit - Private  \n",
       "24250                      Psychiatric                     Proprietary  \n",
       "10341             Acute Care Hospitals  Voluntary non-profit - Private  \n",
       "81101                      Psychiatric                     Proprietary  \n",
       "39835             Acute Care Hospitals                     Proprietary  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_accounts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Provider Name</th>\n",
       "      <th>Provider Street Address</th>\n",
       "      <th>Provider City</th>\n",
       "      <th>Provider State</th>\n",
       "      <th>Provider Zip Code</th>\n",
       "      <th>Total Discharges</th>\n",
       "      <th>Average Covered Charges</th>\n",
       "      <th>Average Total Payments</th>\n",
       "      <th>Average Medicare Payments</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Provider_Num</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>839987</th>\n",
       "      <td>SOUTHEAST ALABAMA MEDICAL CENTER</td>\n",
       "      <td>1108 ROSS CLARK CIRCLE</td>\n",
       "      <td>DOTHAN</td>\n",
       "      <td>AL</td>\n",
       "      <td>36301</td>\n",
       "      <td>118</td>\n",
       "      <td>20855.61</td>\n",
       "      <td>5026.19</td>\n",
       "      <td>4115.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>519118</th>\n",
       "      <td>MARSHALL MEDICAL CENTER SOUTH</td>\n",
       "      <td>2505 U S HIGHWAY 431 NORTH</td>\n",
       "      <td>BOAZ</td>\n",
       "      <td>AL</td>\n",
       "      <td>35957</td>\n",
       "      <td>43</td>\n",
       "      <td>13289.09</td>\n",
       "      <td>5413.63</td>\n",
       "      <td>4490.93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733073</th>\n",
       "      <td>ELIZA COFFEE MEMORIAL HOSPITAL</td>\n",
       "      <td>205 MARENGO STREET</td>\n",
       "      <td>FLORENCE</td>\n",
       "      <td>AL</td>\n",
       "      <td>35631</td>\n",
       "      <td>73</td>\n",
       "      <td>22261.60</td>\n",
       "      <td>4922.18</td>\n",
       "      <td>4021.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201752</th>\n",
       "      <td>MIZELL MEMORIAL HOSPITAL</td>\n",
       "      <td>702 N MAIN ST</td>\n",
       "      <td>OPP</td>\n",
       "      <td>AL</td>\n",
       "      <td>36467</td>\n",
       "      <td>12</td>\n",
       "      <td>10901.33</td>\n",
       "      <td>5343.50</td>\n",
       "      <td>4284.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>678488</th>\n",
       "      <td>ST VINCENT'S EAST</td>\n",
       "      <td>50 MEDICAL PARK EAST DRIVE</td>\n",
       "      <td>BIRMINGHAM</td>\n",
       "      <td>AL</td>\n",
       "      <td>35235</td>\n",
       "      <td>74</td>\n",
       "      <td>28117.95</td>\n",
       "      <td>5947.12</td>\n",
       "      <td>4819.53</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 Provider Name     Provider Street Address  \\\n",
       "Provider_Num                                                                 \n",
       "839987        SOUTHEAST ALABAMA MEDICAL CENTER      1108 ROSS CLARK CIRCLE   \n",
       "519118           MARSHALL MEDICAL CENTER SOUTH  2505 U S HIGHWAY 431 NORTH   \n",
       "733073          ELIZA COFFEE MEMORIAL HOSPITAL          205 MARENGO STREET   \n",
       "201752                MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   \n",
       "678488                       ST VINCENT'S EAST  50 MEDICAL PARK EAST DRIVE   \n",
       "\n",
       "             Provider City Provider State  Provider Zip Code  \\\n",
       "Provider_Num                                                   \n",
       "839987              DOTHAN             AL              36301   \n",
       "519118                BOAZ             AL              35957   \n",
       "733073            FLORENCE             AL              35631   \n",
       "201752                 OPP             AL              36467   \n",
       "678488          BIRMINGHAM             AL              35235   \n",
       "\n",
       "              Total Discharges  Average Covered Charges  \\\n",
       "Provider_Num                                              \n",
       "839987                     118                 20855.61   \n",
       "519118                      43                 13289.09   \n",
       "733073                      73                 22261.60   \n",
       "201752                      12                 10901.33   \n",
       "678488                      74                 28117.95   \n",
       "\n",
       "              Average Total Payments  Average Medicare Payments  \n",
       "Provider_Num                                                     \n",
       "839987                       5026.19                    4115.52  \n",
       "519118                       5413.63                    4490.93  \n",
       "733073                       4922.18                    4021.79  \n",
       "201752                       5343.50                    4284.17  \n",
       "678488                       5947.12                    4819.53  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_reimbursement.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Index>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the indexer\n",
    "indexer = recordlinkage.Index()\n",
    "# Can use full or block\n",
    "#indexer.full()\n",
    "#indexer.block(left_on='State', right_on='Provider State')\n",
    "\n",
    "# Use sortedneighbor as a good option if data is not clean\n",
    "indexer.sortedneighbourhood(left_on='State', right_on='Provider State')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "candidates = indexer.index(hospital_accounts, hospital_reimbursement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "998860\n"
     ]
    }
   ],
   "source": [
    "# Let's see how many matches we want to do\n",
    "print(len(candidates))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Takes 3 minutes using the full index.\n",
    "# 14s using sorted neighbor\n",
    "# 7s using blocking\n",
    "compare = recordlinkage.Compare()\n",
    "compare.exact('City', 'Provider City', label='City')\n",
    "compare.string('Facility Name',\n",
    "               'Provider Name',\n",
    "               threshold=0.85,\n",
    "               label='Hosp_Name')\n",
    "compare.string('Address',\n",
    "               'Provider Street Address',\n",
    "               method='jarowinkler',\n",
    "               threshold=0.85,\n",
    "               label='Hosp_Address')\n",
    "features = compare.compute(candidates, hospital_accounts,\n",
    "                           hospital_reimbursement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Hosp_Name</th>\n",
       "      <th>Hosp_Address</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">10605</th>\n",
       "      <th>537184</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>803181</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>450616</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>854377</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>560361</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">70226</th>\n",
       "      <th>815904</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>746090</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193062</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>834984</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>365095</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>998860 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                          City  Hosp_Name  Hosp_Address\n",
       "Account_Num Provider_Num                               \n",
       "10605       537184           0        0.0           0.0\n",
       "            803181           0        0.0           0.0\n",
       "            450616           0        0.0           0.0\n",
       "            854377           0        0.0           0.0\n",
       "            560361           0        0.0           0.0\n",
       "...                        ...        ...           ...\n",
       "70226       815904           0        0.0           0.0\n",
       "            746090           0        0.0           0.0\n",
       "            193062           0        0.0           0.0\n",
       "            834984           0        0.0           0.0\n",
       "            365095           0        0.0           0.0\n",
       "\n",
       "[998860 rows x 3 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.0      2285\n",
       "2.0       451\n",
       "1.0      7937\n",
       "0.0    988187\n",
       "dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What are the score totals?\n",
    "features.sum(axis=1).value_counts().sort_index(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the potential matches\n",
    "potential_matches = features[features.sum(axis=1) > 1].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>City</th>\n",
       "      <th>Hosp_Name</th>\n",
       "      <th>Hosp_Address</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>51216</td>\n",
       "      <td>268781</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>55272</td>\n",
       "      <td>556917</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>87807</td>\n",
       "      <td>854637</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>51151</td>\n",
       "      <td>783146</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11740</td>\n",
       "      <td>260374</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Account_Num  Provider_Num  City  Hosp_Name  Hosp_Address  Score\n",
       "0        51216        268781     0        1.0           1.0    2.0\n",
       "1        55272        556917     1        1.0           1.0    3.0\n",
       "2        87807        854637     1        1.0           1.0    3.0\n",
       "3        51151        783146     1        0.0           1.0    2.0\n",
       "4        11740        260374     1        1.0           1.0    3.0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "potential_matches['Score'] = potential_matches.loc[:, 'City':'Hosp_Address'].sum(axis=1)\n",
    "potential_matches.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Facility Name             ST FRANCIS MEDICAL CENTER\n",
       "Address                       2400 ST FRANCIS DRIVE\n",
       "City                                   BRECKENRIDGE\n",
       "State                                            MN\n",
       "ZIP Code                                      56520\n",
       "County Name                                  WILKIN\n",
       "Phone Number                         (218) 643-3000\n",
       "Hospital Type             Critical Access Hospitals\n",
       "Hospital Ownership    Voluntary non-profit - Church\n",
       "Name: 51216, dtype: object"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_accounts.loc[51216,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Provider Name                SAINT FRANCIS MEDICAL CENTER\n",
       "Provider Street Address                 211 ST FRANCIS DR\n",
       "Provider City                              CAPE GIRARDEAU\n",
       "Provider State                                         MO\n",
       "Provider Zip Code                                   63703\n",
       "Total Discharges                                      141\n",
       "Average Covered Charges                           42515.1\n",
       "Average Total Payments                            5902.24\n",
       "Average Medicare Payments                         4993.43\n",
       "Name: 268781, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_reimbursement.loc[268781,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add some convenience columns for comparing data\n",
    "hospital_accounts['Acct_Name_Lookup'] = hospital_accounts[[\n",
    "    'Facility Name', 'Address', 'City', 'State'\n",
    "]].apply(lambda x: '_'.join(x), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "hospital_reimbursement['Reimbursement_Name_Lookup'] = hospital_reimbursement[[\n",
    "    'Provider Name', 'Provider Street Address', 'Provider City',\n",
    "    'Provider State'\n",
    "]].apply(lambda x: '_'.join(x), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "reimbursement_lookup = hospital_reimbursement[['Reimbursement_Name_Lookup']].reset_index()\n",
    "account_lookup = hospital_accounts[['Acct_Name_Lookup']].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Acct_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10605</td>\n",
       "      <td>SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>24250</td>\n",
       "      <td>WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10341</td>\n",
       "      <td>DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>81101</td>\n",
       "      <td>SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>39835</td>\n",
       "      <td>TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Account_Num                                   Acct_Name_Lookup\n",
       "0        10605  SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...\n",
       "1        24250  WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...\n",
       "2        10341  DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL\n",
       "3        81101  SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...\n",
       "4        39835  TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR..."
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "account_lookup.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Reimbursement_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>839987</td>\n",
       "      <td>SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>519118</td>\n",
       "      <td>MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>733073</td>\n",
       "      <td>ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>201752</td>\n",
       "      <td>MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>678488</td>\n",
       "      <td>ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Provider_Num                          Reimbursement_Name_Lookup\n",
       "0        839987  SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n",
       "1        519118  MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n",
       "2        733073  ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n",
       "3        201752      MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n",
       "4        678488  ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..."
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reimbursement_lookup.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "account_merge = potential_matches.merge(account_lookup, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>City</th>\n",
       "      <th>Hosp_Name</th>\n",
       "      <th>Hosp_Address</th>\n",
       "      <th>Score</th>\n",
       "      <th>Acct_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>51216</td>\n",
       "      <td>268781</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>55272</td>\n",
       "      <td>556917</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>87807</td>\n",
       "      <td>854637</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>51151</td>\n",
       "      <td>783146</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11740</td>\n",
       "      <td>260374</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Account_Num  Provider_Num  City  Hosp_Name  Hosp_Address  Score  \\\n",
       "0        51216        268781     0        1.0           1.0    2.0   \n",
       "1        55272        556917     1        1.0           1.0    3.0   \n",
       "2        87807        854637     1        1.0           1.0    3.0   \n",
       "3        51151        783146     1        0.0           1.0    2.0   \n",
       "4        11740        260374     1        1.0           1.0    3.0   \n",
       "\n",
       "                                    Acct_Name_Lookup  \n",
       "0  ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV...  \n",
       "1  SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB...  \n",
       "2  ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O...  \n",
       "3  ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V...  \n",
       "4  SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200...  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "account_merge.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Reimbursement_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>839987</td>\n",
       "      <td>SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>519118</td>\n",
       "      <td>MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>733073</td>\n",
       "      <td>ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>201752</td>\n",
       "      <td>MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>678488</td>\n",
       "      <td>ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Provider_Num                          Reimbursement_Name_Lookup\n",
       "0        839987  SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n",
       "1        519118  MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n",
       "2        733073  ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n",
       "3        201752      MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n",
       "4        678488  ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..."
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reimbursement_lookup.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's build a dataframe to  compare\n",
    "final_merge = account_merge.merge(reimbursement_lookup, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "    'Account_Num', 'Provider_Num', 'Score', 'Acct_Name_Lookup',\n",
    "    'Reimbursement_Name_Lookup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Score</th>\n",
       "      <th>Acct_Name_Lookup</th>\n",
       "      <th>Reimbursement_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2660</th>\n",
       "      <td>94995</td>\n",
       "      <td>825914</td>\n",
       "      <td>3.0</td>\n",
       "      <td>CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...</td>\n",
       "      <td>CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1975</th>\n",
       "      <td>94953</td>\n",
       "      <td>819181</td>\n",
       "      <td>3.0</td>\n",
       "      <td>LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...</td>\n",
       "      <td>LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1042</th>\n",
       "      <td>94943</td>\n",
       "      <td>680596</td>\n",
       "      <td>3.0</td>\n",
       "      <td>VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...</td>\n",
       "      <td>VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2305</th>\n",
       "      <td>94923</td>\n",
       "      <td>403151</td>\n",
       "      <td>3.0</td>\n",
       "      <td>UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...</td>\n",
       "      <td>UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2512</th>\n",
       "      <td>94887</td>\n",
       "      <td>752284</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI...</td>\n",
       "      <td>NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2080</th>\n",
       "      <td>10165</td>\n",
       "      <td>188247</td>\n",
       "      <td>3.0</td>\n",
       "      <td>UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT</td>\n",
       "      <td>UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1825</th>\n",
       "      <td>10090</td>\n",
       "      <td>212069</td>\n",
       "      <td>3.0</td>\n",
       "      <td>CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...</td>\n",
       "      <td>CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2424</th>\n",
       "      <td>10043</td>\n",
       "      <td>140535</td>\n",
       "      <td>3.0</td>\n",
       "      <td>BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...</td>\n",
       "      <td>BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1959</th>\n",
       "      <td>10020</td>\n",
       "      <td>210657</td>\n",
       "      <td>3.0</td>\n",
       "      <td>ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...</td>\n",
       "      <td>ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1958</th>\n",
       "      <td>10020</td>\n",
       "      <td>121670</td>\n",
       "      <td>2.0</td>\n",
       "      <td>ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...</td>\n",
       "      <td>UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2736 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Account_Num  Provider_Num  Score  \\\n",
       "2660        94995        825914    3.0   \n",
       "1975        94953        819181    3.0   \n",
       "1042        94943        680596    3.0   \n",
       "2305        94923        403151    3.0   \n",
       "2512        94887        752284    2.0   \n",
       "...           ...           ...    ...   \n",
       "2080        10165        188247    3.0   \n",
       "1825        10090        212069    3.0   \n",
       "2424        10043        140535    3.0   \n",
       "1959        10020        210657    3.0   \n",
       "1958        10020        121670    2.0   \n",
       "\n",
       "                                       Acct_Name_Lookup  \\\n",
       "2660  CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...   \n",
       "1975  LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...   \n",
       "1042  VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...   \n",
       "2305  UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...   \n",
       "2512  NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI...   \n",
       "...                                                 ...   \n",
       "2080  UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT   \n",
       "1825  CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...   \n",
       "2424  BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...   \n",
       "1959  ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...   \n",
       "1958  ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...   \n",
       "\n",
       "                              Reimbursement_Name_Lookup  \n",
       "2660  CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...  \n",
       "1975  LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...  \n",
       "1042  VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...  \n",
       "2305  UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...  \n",
       "2512  NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B...  \n",
       "...                                                 ...  \n",
       "2080  UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT  \n",
       "1825  CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...  \n",
       "2424  BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...  \n",
       "1959  ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...  \n",
       "1958  UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M...  \n",
       "\n",
       "[2736 rows x 5 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_merge[cols].sort_values(by=['Account_Num', 'Score'], ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If you need to save it to Excel -\n",
    "#final_merge.sort_values(by=['Account_Num', 'Score'],\n",
    "#                        ascending=False).to_excel('merge_list.xlsx',\n",
    "#                                                  index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Score</th>\n",
       "      <th>Acct_Name_Lookup</th>\n",
       "      <th>Reimbursement_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2302</th>\n",
       "      <td>11035</td>\n",
       "      <td>550921</td>\n",
       "      <td>2.0</td>\n",
       "      <td>SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO</td>\n",
       "      <td>PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2303</th>\n",
       "      <td>11035</td>\n",
       "      <td>706807</td>\n",
       "      <td>3.0</td>\n",
       "      <td>SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO</td>\n",
       "      <td>SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Account_Num  Provider_Num  Score  \\\n",
       "2302        11035        550921    2.0   \n",
       "2303        11035        706807    3.0   \n",
       "\n",
       "                                     Acct_Name_Lookup  \\\n",
       "2302  SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO   \n",
       "2303  SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO   \n",
       "\n",
       "                              Reimbursement_Name_Lookup  \n",
       "2302  PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19...  \n",
       "2303  SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE...  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_merge[final_merge['Account_Num']==11035][cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num</th>\n",
       "      <th>Provider_Num</th>\n",
       "      <th>Score</th>\n",
       "      <th>Acct_Name_Lookup</th>\n",
       "      <th>Reimbursement_Name_Lookup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>56375</td>\n",
       "      <td>390402</td>\n",
       "      <td>2.0</td>\n",
       "      <td>BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...</td>\n",
       "      <td>HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156</th>\n",
       "      <td>56375</td>\n",
       "      <td>451229</td>\n",
       "      <td>3.0</td>\n",
       "      <td>BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...</td>\n",
       "      <td>BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Account_Num  Provider_Num  Score  \\\n",
       "155        56375        390402    2.0   \n",
       "156        56375        451229    3.0   \n",
       "\n",
       "                                      Acct_Name_Lookup  \\\n",
       "155  BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...   \n",
       "156  BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...   \n",
       "\n",
       "                             Reimbursement_Name_Lookup  \n",
       "155  HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA...  \n",
       "156  BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_merge[final_merge['Account_Num']==56375][cols]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dedupe the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "hospital_dupes = pd.read_csv(\n",
    "    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_dupes.csv',\n",
    "    index_col='Account_Num')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP Code</th>\n",
       "      <th>County Name</th>\n",
       "      <th>Phone Number</th>\n",
       "      <th>Hospital Type</th>\n",
       "      <th>Hospital Ownership</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Account_Num</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>71730</th>\n",
       "      <td>SAGE MEMORIAL HOSPITAL</td>\n",
       "      <td>STATE ROUTE 264 SOUTH 191</td>\n",
       "      <td>GANADO</td>\n",
       "      <td>AZ</td>\n",
       "      <td>86505</td>\n",
       "      <td>APACHE</td>\n",
       "      <td>(928) 755-4541</td>\n",
       "      <td>Critical Access Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70116</th>\n",
       "      <td>WOODRIDGE BEHAVIORAL CENTER</td>\n",
       "      <td>600 NORTH 7TH STREET</td>\n",
       "      <td>WEST MEMPHIS</td>\n",
       "      <td>AR</td>\n",
       "      <td>72301</td>\n",
       "      <td>CRITTENDEN</td>\n",
       "      <td>(870) 394-4113</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87991</th>\n",
       "      <td>DOUGLAS GARDENS HOSPITAL</td>\n",
       "      <td>5200 NE 2ND AVE</td>\n",
       "      <td>MIAMI</td>\n",
       "      <td>FL</td>\n",
       "      <td>33137</td>\n",
       "      <td>MIAMI-DADE</td>\n",
       "      <td>(305) 751-8626</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Voluntary non-profit - Private</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22662</th>\n",
       "      <td>SUNCOAST BEHAVIORAL HEALTH CENTER</td>\n",
       "      <td>4480 51ST ST W</td>\n",
       "      <td>BRADENTON</td>\n",
       "      <td>FL</td>\n",
       "      <td>34210</td>\n",
       "      <td>MANATEE</td>\n",
       "      <td>(941) 792-2222</td>\n",
       "      <td>Psychiatric</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63165</th>\n",
       "      <td>TREASURE VALLEY HOSPITAL</td>\n",
       "      <td>8800 WEST EMERALD STREET</td>\n",
       "      <td>BOISE</td>\n",
       "      <td>ID</td>\n",
       "      <td>83704</td>\n",
       "      <td>ADA</td>\n",
       "      <td>(208) 373-5000</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 Facility Name                    Address  \\\n",
       "Account_Num                                                                 \n",
       "71730                   SAGE MEMORIAL HOSPITAL  STATE ROUTE 264 SOUTH 191   \n",
       "70116              WOODRIDGE BEHAVIORAL CENTER       600 NORTH 7TH STREET   \n",
       "87991                 DOUGLAS GARDENS HOSPITAL            5200 NE 2ND AVE   \n",
       "22662        SUNCOAST BEHAVIORAL HEALTH CENTER             4480 51ST ST W   \n",
       "63165                 TREASURE VALLEY HOSPITAL   8800 WEST EMERALD STREET   \n",
       "\n",
       "                     City State  ZIP Code County Name    Phone Number  \\\n",
       "Account_Num                                                             \n",
       "71730              GANADO    AZ     86505      APACHE  (928) 755-4541   \n",
       "70116        WEST MEMPHIS    AR     72301  CRITTENDEN  (870) 394-4113   \n",
       "87991               MIAMI    FL     33137  MIAMI-DADE  (305) 751-8626   \n",
       "22662           BRADENTON    FL     34210     MANATEE  (941) 792-2222   \n",
       "63165               BOISE    ID     83704         ADA  (208) 373-5000   \n",
       "\n",
       "                         Hospital Type              Hospital Ownership  \n",
       "Account_Num                                                             \n",
       "71730        Critical Access Hospitals  Voluntary non-profit - Private  \n",
       "70116                      Psychiatric                     Proprietary  \n",
       "87991             Acute Care Hospitals  Voluntary non-profit - Private  \n",
       "22662                      Psychiatric                     Proprietary  \n",
       "63165             Acute Care Hospitals                     Proprietary  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospital_dupes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deduping follows the same process, you just use 1 single dataframe\n",
    "dupe_indexer = recordlinkage.Index()\n",
    "dupe_indexer.sortedneighbourhood(left_on='State')\n",
    "dupe_candidate_links = dupe_indexer.index(hospital_dupes)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Comparison step\n",
    "compare_dupes = recordlinkage.Compare()\n",
    "compare_dupes.string('City', 'City', threshold=0.85, label='City')\n",
    "compare_dupes.string('Phone Number',\n",
    "                     'Phone Number',\n",
    "                     threshold=0.85,\n",
    "                     label='Phone_Num')\n",
    "compare_dupes.string('Facility Name',\n",
    "                     'Facility Name',\n",
    "                     threshold=0.80,\n",
    "                     label='Hosp_Name')\n",
    "compare_dupes.string('Address',\n",
    "                     'Address',\n",
    "                     threshold=0.85,\n",
    "                     label='Hosp_Address')\n",
    "dupe_features = compare_dupes.compute(dupe_candidate_links, hospital_dupes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Phone_Num</th>\n",
       "      <th>Hosp_Name</th>\n",
       "      <th>Hosp_Address</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Account_Num_1</th>\n",
       "      <th>Account_Num_2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">26270</th>\n",
       "      <th>28485</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30430</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43602</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">59585</th>\n",
       "      <th>28485</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30430</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">64029</th>\n",
       "      <th>38600</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35413</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81525</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82916</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18907</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>981277 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                             City  Phone_Num  Hosp_Name  Hosp_Address\n",
       "Account_Num_1 Account_Num_2                                          \n",
       "26270         28485           0.0        0.0        0.0           0.0\n",
       "              30430           0.0        0.0        0.0           0.0\n",
       "              43602           0.0        0.0        0.0           0.0\n",
       "59585         28485           0.0        0.0        0.0           0.0\n",
       "              30430           0.0        0.0        0.0           0.0\n",
       "...                           ...        ...        ...           ...\n",
       "64029         38600           0.0        0.0        0.0           0.0\n",
       "              35413           0.0        0.0        0.0           0.0\n",
       "              81525           0.0        0.0        0.0           0.0\n",
       "              82916           0.0        0.0        0.0           0.0\n",
       "              18907           0.0        0.0        0.0           0.0\n",
       "\n",
       "[981277 rows x 4 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dupe_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.0         7\n",
       "2.0       206\n",
       "1.0      7859\n",
       "0.0    973205\n",
       "dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dupe_features.sum(axis=1).value_counts().sort_index(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "potential_dupes = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()\n",
    "potential_dupes['Score'] = potential_dupes.loc[:, 'City':'Hosp_Address'].sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account_Num_1</th>\n",
       "      <th>Account_Num_2</th>\n",
       "      <th>City</th>\n",
       "      <th>Phone_Num</th>\n",
       "      <th>Hosp_Name</th>\n",
       "      <th>Hosp_Address</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>28494</td>\n",
       "      <td>37949</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>74835</td>\n",
       "      <td>77000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>24549</td>\n",
       "      <td>28485</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>70366</td>\n",
       "      <td>52654</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>61685</td>\n",
       "      <td>24849</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>51567</td>\n",
       "      <td>41166</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>26495</td>\n",
       "      <td>41079</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Account_Num_1  Account_Num_2  City  Phone_Num  Hosp_Name  Hosp_Address  \\\n",
       "0          28494          37949   1.0        1.0        0.0           1.0   \n",
       "1          74835          77000   1.0        1.0        0.0           1.0   \n",
       "2          24549          28485   1.0        1.0        0.0           1.0   \n",
       "3          70366          52654   1.0        1.0        0.0           1.0   \n",
       "4          61685          24849   1.0        1.0        0.0           1.0   \n",
       "5          51567          41166   1.0        1.0        1.0           0.0   \n",
       "6          26495          41079   1.0        1.0        0.0           1.0   \n",
       "\n",
       "   Score  \n",
       "0    3.0  \n",
       "1    3.0  \n",
       "2    3.0  \n",
       "3    3.0  \n",
       "4    3.0  \n",
       "5    3.0  \n",
       "6    3.0  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "potential_dupes.sort_values(by=['Score'], ascending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Facility Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>ZIP Code</th>\n",
       "      <th>County Name</th>\n",
       "      <th>Phone Number</th>\n",
       "      <th>Hospital Type</th>\n",
       "      <th>Hospital Ownership</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Account_Num</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>41166</th>\n",
       "      <td>ST VINCENT HOSPITAL</td>\n",
       "      <td>835 S VAN BUREN ST</td>\n",
       "      <td>GREEN BAY</td>\n",
       "      <td>WI</td>\n",
       "      <td>54301</td>\n",
       "      <td>BROWN</td>\n",
       "      <td>(920) 433-0111</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Voluntary non-profit - Church</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51567</th>\n",
       "      <td>SAINT VINCENT HOSPITAL</td>\n",
       "      <td>835 SOUTH VAN BUREN ST</td>\n",
       "      <td>GREEN BAY</td>\n",
       "      <td>WI</td>\n",
       "      <td>54301</td>\n",
       "      <td>BROWN</td>\n",
       "      <td>(920) 433-0112</td>\n",
       "      <td>Acute Care Hospitals</td>\n",
       "      <td>Voluntary non-profit - Church</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Facility Name                 Address       City State  \\\n",
       "Account_Num                                                                    \n",
       "41166           ST VINCENT HOSPITAL      835 S VAN BUREN ST  GREEN BAY    WI   \n",
       "51567        SAINT VINCENT HOSPITAL  835 SOUTH VAN BUREN ST  GREEN BAY    WI   \n",
       "\n",
       "             ZIP Code County Name    Phone Number         Hospital Type  \\\n",
       "Account_Num                                                               \n",
       "41166           54301       BROWN  (920) 433-0111  Acute Care Hospitals   \n",
       "51567           54301       BROWN  (920) 433-0112  Acute Care Hospitals   \n",
       "\n",
       "                        Hospital Ownership  \n",
       "Account_Num                                 \n",
       "41166        Voluntary non-profit - Church  \n",
       "51567        Voluntary non-profit - Church  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take a look at one of the potential duplicates\n",
    "hospital_dupes[hospital_dupes.index.isin([51567, 41166])]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.2 64-bit",
   "language": "python",
   "name": "python37264bit24e6adcd4e9b4d8bb6ff9239d4d45105"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
